- Data Visualization
- Overview
- Data visualization transforms complex data into clear, compelling visual representations that reveal patterns, trends, and insights for storytelling and decision-making.
- When to Use
- Exploratory data analysis and pattern discovery
- Communicating insights to stakeholders
- Comparing distributions and relationships
- Presenting findings in reports and dashboards
- Identifying outliers and anomalies visually
- Creating publication-ready charts and graphs
- Visualization Types
- Distributions: Histograms, KDE, violin plots
- Relationships: Scatter plots, line plots, heatmaps
- Comparisons: Bar charts, box plots, ridge plots
- Compositions: Pie charts, stacked bars, treemaps
- Temporal: Line plots, area charts, time series
- Multivariate: Pair plots, correlation heatmaps
- Design Principles
- Choose appropriate chart type for data
- Minimize ink-to-data ratio
- Use color purposefully
- Label clearly and completely
- Maintain consistent scales
- Consider accessibility
- Implementation with Python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec
# Set style: seaborn "whitegrid" theme and a default figure size of 12x6 inches
# for every figure that does not pass its own figsize.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
# Generate sample data
# A fixed seed keeps the synthetic sample identical from run to run.
np.random.seed(42)
n = 500  # number of synthetic rows

# Column order below is also the order in which random numbers are drawn,
# so it must not be rearranged if the seeded values are to stay the same.
data = pd.DataFrame({
    'age': np.random.uniform(20, 70, n),
    'income': np.random.exponential(50000, n),
    'education_years': np.random.uniform(12, 20, n),
    'category': np.random.choice(['A', 'B', 'C'], n),
    'region': np.random.choice(['North', 'South', 'East', 'West'], n),
    'satisfaction': np.random.uniform(1, 5, n),
    'purchased': np.random.choice([0, 1], n),
})

print(data.head())
# 1. Distribution Plots
# 2x2 grid: histogram, KDE, box plot and violin plot.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Histogram
axes[0, 0].hist(data['age'], bins=30, color='skyblue', edgecolor='black')
axes[0, 0].set_title('Age Distribution (Histogram)')
axes[0, 0].set_xlabel('Age')
axes[0, 0].set_ylabel('Frequency')

# KDE plot
data['income'].plot(kind='kde', ax=axes[0, 1], color='green', linewidth=2)
axes[0, 1].set_title('Income Distribution (KDE)')
axes[0, 1].set_xlabel('Income')

# Box plot
sns.boxplot(data=data, y='satisfaction', x='category', ax=axes[1, 0], palette='Set2')
axes[1, 0].set_title('Satisfaction by Category (Box Plot)')

# Violin plot
sns.violinplot(data=data, y='age', x='category', ax=axes[1, 1], palette='Set2')
axes[1, 1].set_title('Age by Category (Violin Plot)')

plt.tight_layout()
plt.show()
# 2. Relationship Plots
# 2x2 grid: plain scatter, scatter with regression line, hex-bin and bubble plot,
# all showing age against income.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Scatter plot
axes[0, 0].scatter(data['age'], data['income'], alpha=0.5, s=30)
axes[0, 0].set_title('Age vs Income (Scatter Plot)')
axes[0, 0].set_xlabel('Age')
axes[0, 0].set_ylabel('Income')

# Scatter with regression line
sns.regplot(x='age', y='income', data=data, ax=axes[0, 1], scatter_kws={'alpha': 0.5})
axes[0, 1].set_title('Age vs Income (with Regression Line)')

# Joint plot alternative
ax_hex = axes[1, 0]
ax_hex.hexbin(data['age'], data['income'], gridsize=15, cmap='YlOrRd')
ax_hex.set_title('Age vs Income (Hex Bin)')
ax_hex.set_xlabel('Age')
ax_hex.set_ylabel('Income')

# Bubble plot: marker size and colour both encode satisfaction.
# The returned collection is kept because the colorbar below needs it.
scatter = axes[1, 1].scatter(
    data['age'],
    data['income'],
    s=data['satisfaction'] * 50,
    c=data['satisfaction'],
    cmap='viridis',
    alpha=0.6,
    edgecolors='black',
)
axes[1, 1].set_title('Age vs Income (Bubble Plot)')
axes[1, 1].set_xlabel('Age')
axes[1, 1].set_ylabel('Income')
plt.colorbar(scatter, ax=axes[1, 1], label='Satisfaction')

plt.tight_layout()
plt.show()
# 3. Comparison Plots
# 2x2 grid: simple, grouped, stacked and horizontal bar charts of row counts.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Bar plot
category_counts = data['category'].value_counts()
axes[0, 0].bar(category_counts.index, category_counts.values, color='skyblue', edgecolor='black')
axes[0, 0].set_title('Category Distribution (Bar Chart)')
axes[0, 0].set_ylabel('Count')

# Grouped bar plot
# Rows are categories, columns are regions, cells are row counts.
grouped_data = data.groupby(['category', 'region']).size().unstack()
grouped_data.plot(kind='bar', ax=axes[0, 1], edgecolor='black')
axes[0, 1].set_title('Category by Region (Grouped Bar)')
axes[0, 1].set_ylabel('Count')
axes[0, 1].legend(title='Region')

# Stacked bar plot
grouped_data.plot(kind='bar', stacked=True, ax=axes[1, 0], edgecolor='black')
axes[1, 0].set_title('Category by Region (Stacked Bar)')
axes[1, 0].set_ylabel('Count')

# Horizontal bar plot
region_counts = data['region'].value_counts()
axes[1, 1].barh(region_counts.index, region_counts.values, color='lightcoral', edgecolor='black')
axes[1, 1].set_title('Region Distribution (Horizontal Bar)')
axes[1, 1].set_xlabel('Count')

plt.tight_layout()
plt.show()
# 4. Correlation and Heatmaps
# Despite its name, numeric_cols holds the pairwise correlation matrix of the
# four numeric columns, not the columns themselves.
numeric_cols = data[['age', 'income', 'education_years', 'satisfaction']].corr()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Correlation heatmap
sns.heatmap(
    numeric_cols,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    ax=axes[0],
    cbar_kws={'label': 'Correlation'},
)
axes[0].set_title('Correlation Matrix Heatmap')

# Clustermap alternative
# NOTE(review): none of these SciPy names are used by the code below.
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist, squareform

# Create a simpler heatmap for category averages
category_avg = data.groupby('category')[['age', 'income', 'education_years', 'satisfaction']].mean()
# Transposed so that variables are rows and categories are columns.
sns.heatmap(
    category_avg.T,
    annot=True,
    fmt='.1f',
    cmap='YlGnBu',
    ax=axes[1],
    cbar_kws={'label': 'Average Value'},
)
axes[1].set_title('Average Values by Category')

plt.tight_layout()
plt.show()
# 5. Pair Plot
pair_cols = ['age', 'income', 'education_years', 'satisfaction']

# sns.pairplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(...) call would only leave an empty extra figure behind.
# The intended 12x10 inch size is set through the per-facet height and aspect:
# 4 columns x (2.5 * 1.2) in wide, 4 rows x 2.5 in tall.
pair_plot = sns.pairplot(
    data[pair_cols],
    diag_kind='hist',
    corner=False,
    height=2.5,
    aspect=1.2,
)
pair_plot.fig.suptitle('Pair Plot Matrix', y=1.00)
plt.show()
# 6. Multi-dimensional Visualization
# 2x3 GridSpec layout: three panels on the top row, a single panel plus a
# double-width panel on the bottom row.
fig = plt.figure(figsize=(14, 6))
gs = GridSpec(2, 3, figure=fig)

# Subplots with different aspects
ax1 = fig.add_subplot(gs[0, 0])
ax1.scatter(data['age'], data['income'], c=data['satisfaction'], cmap='viridis', alpha=0.6)
ax1.set_title('Age vs Income (colored by Satisfaction)')
ax1.set_xlabel('Age')
ax1.set_ylabel('Income')

# One scatter call per category so each gets its own colour and legend entry.
ax2 = fig.add_subplot(gs[0, 1])
for cat in data['category'].unique():
    subset = data[data['category'] == cat]
    ax2.scatter(subset['age'], subset['income'], label=cat, alpha=0.6)
ax2.set_title('Age vs Income (by Category)')
ax2.set_xlabel('Age')
ax2.set_ylabel('Income')
ax2.legend()

ax3 = fig.add_subplot(gs[0, 2])
sns.boxplot(data=data, x='region', y='income', ax=ax3, palette='Set2')
ax3.set_title('Income Distribution by Region')

ax4 = fig.add_subplot(gs[1, 0])
data.groupby('category')['satisfaction'].mean().plot(
    kind='bar', ax=ax4, color='skyblue', edgecolor='black'
)
ax4.set_title('Average Satisfaction by Category')
ax4.set_ylabel('Satisfaction')
ax4.set_xlabel('Category')

# Bottom-right panel spans the last two grid columns.
ax5 = fig.add_subplot(gs[1, 1:])
region_category = pd.crosstab(data['region'], data['category'])
region_category.plot(kind='bar', ax=ax5, edgecolor='black')
ax5.set_title('Region vs Category Distribution')
ax5.set_ylabel('Count')
ax5.set_xlabel('Region')
ax5.legend(title='Category')

plt.tight_layout()
plt.show()
# 7. Time Series Visualization (if temporal data)
# The sample data has no time column, so consecutive daily dates starting at
# 2023-01-01 are attached, one per row. This adds 'date' and
# 'cumulative_income' columns to `data` in place.
dates = pd.date_range('2023-01-01', periods=len(data))
data['date'] = dates
data['cumulative_income'] = data['income'].cumsum()

fig, axes = plt.subplots(2, 1, figsize=(12, 8))

# Line plot
axes[0].plot(data['date'], data['income'], linewidth=1, alpha=0.7, label='Income')
axes[0].fill_between(data['date'], data['income'], alpha=0.3)
axes[0].set_title('Income Over Time')
axes[0].set_ylabel('Income')
axes[0].grid(True, alpha=0.3)
axes[0].legend()

# Area plot
axes[1].plot(data['date'], data['cumulative_income'], linewidth=2, color='green')
axes[1].fill_between(data['date'], data['cumulative_income'], alpha=0.3, color='green')
axes[1].set_title('Cumulative Income Over Time')
axes[1].set_ylabel('Cumulative Income')
axes[1].set_xlabel('Date')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
# 8. Composition Visualization
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Pie chart
category_counts = data['category'].value_counts()
colors = ['#ff9999', '#66b3ff', '#99ff99']  # one colour per category
axes[0].pie(
    category_counts.values,
    labels=category_counts.index,
    autopct='%1.1f%%',
    colors=colors,
    startangle=90,
)
axes[0].set_title('Category Distribution (Pie Chart)')

# Donut chart: the same pie, with wedges narrowed to a ring via wedgeprops width.
axes[1].pie(
    category_counts.values,
    labels=category_counts.index,
    autopct='%1.1f%%',
    colors=colors,
    startangle=90,
    wedgeprops=dict(width=0.5, edgecolor='white'),
)
axes[1].set_title('Category Distribution (Donut Chart)')

plt.tight_layout()
plt.show()
# 9. Dashboard-style Visualization
# 3x3 GridSpec: a full-width text banner of key metrics on the top row, three
# small charts in the middle row, and a wide box plot plus one chart below.
fig = plt.figure(figsize=(16, 10))
gs = GridSpec(3, 3, figure=fig, hspace=0.3, wspace=0.3)

# Key metrics
ax_metric = fig.add_subplot(gs[0, :])
ax_metric.axis('off')  # text-only panel, no axes frame or ticks
metrics_text = f"""
Average Age: {data['age'].mean():.1f} | Average Income: ${data['income'].mean():.0f} |
Average Satisfaction: {data['satisfaction'].mean():.2f} | Purchase Rate: {(data['purchased'].mean() * 100):.1f}%
"""
ax_metric.text(
    0.5,
    0.5,
    metrics_text,
    ha='center',
    va='center',
    fontsize=12,
    bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.7),
)

# Subplots
ax1 = fig.add_subplot(gs[1, 0])
data['age'].hist(bins=20, ax=ax1, color='skyblue', edgecolor='black')
ax1.set_title('Age Distribution')

# category_counts is the per-category row count computed in an earlier section.
ax2 = fig.add_subplot(gs[1, 1])
category_counts.plot(kind='bar', ax=ax2, color='lightcoral', edgecolor='black')
ax2.set_title('Category Counts')

ax3 = fig.add_subplot(gs[1, 2])
data.groupby('category')['satisfaction'].mean().plot(
    kind='bar', ax=ax3, color='lightgreen', edgecolor='black'
)
ax3.set_title('Avg Satisfaction by Category')

ax4 = fig.add_subplot(gs[2, :2])
sns.boxplot(data=data, x='region', y='income', ax=ax4, palette='Set2')
ax4.set_title('Income by Region')

# The sample 'satisfaction' column is drawn from a continuous uniform
# distribution, so counting raw values would produce one bar per row.
# Round to the nearest whole score first to get a readable bar chart.
ax5 = fig.add_subplot(gs[2, 2])
satisfaction_scores = data['satisfaction'].round().astype(int)
satisfaction_scores.value_counts().sort_index().plot(
    kind='bar', ax=ax5, color='orange', edgecolor='black'
)
ax5.set_title('Satisfaction Scores')

plt.suptitle('Data Analytics Dashboard', fontsize=16, fontweight='bold', y=0.995)
plt.show()

print("Visualization examples completed!")
- Visualization Best Practices
- Choose chart type based on data type and question
- Use consistent color schemes
- Label axes clearly with units
- Include title and legend
- Avoid 3D charts when 2D suffices
- Make fonts large and readable
- Consider colorblind-friendly palettes
- Common Chart Types
- Bar charts: Categorical comparisons
- Line plots: Trends over time
- Scatter plots: Relationships between variables
- Histograms: Distributions
- Heatmaps: Matrix data
- Box plots: Distribution with quartiles
- Deliverables
- Exploratory visualizations
- Publication-ready charts
- Interactive dashboard mockups
- Statistical plots with annotations
- Trend analysis visualizations
- Comparative analysis charts
- Summary infographics